1. Unsupervised One-block analysis with Principal Component Analysis (PCA)

1.1 Library installation/loading

# Load libraries
library(RGCCA)
library(FactoMineR)
library(factoextra)
library(ggpubr)
library(corrplot)

1.2 Data Introduction

Starts by Slides to present the data-set, slides 36->42 of the “psy-Stat” presentation or use figures from Amazigh’s PhD manuscript.

1.3 Preliminary observation and manipulations

Starts by doing a simple PCA on gene expression matrix.

##############

# Starts from here 
data.train = readRDS(file = "data.train")
DNAm_covariates = readRDS("pd_mdd_bin.RDS")
Sex = as.factor(data.train$cli$Sex)
levels(Sex) = c("Female", "Male")

# PCA on mRNA with full data-set
pca_results <- PCA(data.train$mRNA, scale.unit=TRUE, ncp=25, graph=F) 

fviz_pca_ind(pca_results, axes = c(1,2))+ 
  geom_point(aes(colour = Sex)) +
  guides(colour = guide_legend(title = "color"))

==> There is a huge Sex effect. Openning on methods for Sex (or other batch) correction. For now, we will only keep one of the two sex –> Females (higest number of samples)

# Function taken from CHAMP to help interpret factors
drawheatmap <- function(svdPV.m, title = NULL, display_legend = FALSE) {
  myPalette <- c("darkred", "red", "orange", "pink", "white")
  breaks.v <- c(-10000, -10, -5, -2, log10(0.05), 0)
  image(x = 1:nrow(svdPV.m), y = 1:ncol(svdPV.m), z = log10(svdPV.m),
        col = myPalette, breaks = breaks.v, xlab = "", ylab = "",
        axes = FALSE, main = paste0("Interpretation of the Components-", title))
  axis(1, at = 1:nrow(svdPV.m), labels = paste("Comp-",
                                               1:nrow(svdPV.m), sep = ""), las = 2)
  suppressWarnings(axis(2, at = 1:ncol(svdPV.m), labels = colnames(svdPV.m),
                        las = 2))
  if(display_legend){
    legend(x = 17, y = 10, legend = c(expression("p < 1x" ~ 10^{-10}),
                                              expression("p < 1x" ~ 10^{-5}),
                                              "p < 0.01", "p < 0.05", "p > 0.05"),
         fill = c("darkred","red", "orange", "pink", "white"), par("usr")[2],
         par("usr")[4], xpd = NA)
  }
}

check_association_CHAMP <- function(x, y){
  if(!is.numeric(y)){
    return(kruskal.test(x ~ as.factor(y))$p.value)
  }else{
    return(summary(lm(x ~ y))$coeff[2, 4])
  }
}

1.4 Interpretation of the PCA components

Once we are limited to Females, let’s look back at PCA. To begin with, we start with only two blocks for the sake of simplicity (mRNA and DNAm). In order to better understand what the different PCA components represent, lets us borrow a technique from the R package CHAMP (TODO : Slides to detail this techniques that should apply an F-test for continuous covariates and a Kruskal-Wallis test for qualitative covariates).

Figure : Components are interpreted in three cases : mRNA, DNAm and the concatenation of mRNA & DNAm. The last case is to see what a naïve integration (early integration strategy) can achieve and what are the limitations. Multiple remarks :

  • mRNA : Main effect of interest : the link with “Sample_group” (Patient(MDD)/Control) appears mainly in components 6 & 7. Though not the strongest link (for these component) each time.
  • DNAm : No PC at all link to “Sample_group”.
  • mRNA & DNAm : link to “Sample_group” stays in PC7 and seems strengthened (its is the strongest link for this component!), which encourages integration. Drawbacks : (i) several strong other links are also described for this component and (ii) there isn’t any control over the influence of each omic to contribute for a component. Indeed lets us look at the contribution of each variables.
# Keep only female samples
data.train.female = lapply(data.train, 
                           function(x) x[which(data.train$cli$Sex == F), ])

DNAm_covariates_female = DNAm_covariates[match(rownames(data.train.female$mRNA), 
                                               DNAm_covariates$Sample_ID), ]

covariates_explored = c("Sample_Group", "BMI", "BMI.bin", "Age", "Age.bin", 
                        "Age_bin", "Array", "Slide", "CD4", "CD8", "MO", "B", 
                        "NK", "GR")

DNAm_covariates_explored_female = 
  as.data.frame(DNAm_covariates_female[, covariates_explored])

# mRNA
pca_female_mRNA <- PCA(data.train.female$mRNA, scale.unit=TRUE, ncp=20, graph=F) 

interp_pca_female_mRNA = apply(pca_female_mRNA$ind$coord, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

# DNAm
pca_female_DNAm <- PCA(data.train.female$DNAm, scale.unit=TRUE, ncp=20, graph=F) 

interp_pca_female_DNAm = apply(pca_female_DNAm$ind$coord, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

# mRNA & DNAm
pca_female_mRNA_DNAm <- PCA(cbind(data.train.female$mRNA, 
                                  data.train.female$DNAm), scale.unit=TRUE, 
                            ncp=20, graph=F) 

interp_pca_female_mRNA_DNAm = apply(pca_female_mRNA_DNAm$ind$coord, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

par(mfrow = c(3, 1), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_pca_female_mRNA), title = "Block mRNA")
drawheatmap(t(interp_pca_female_DNAm), title = "Block DNAm", display_legend = T)
drawheatmap(t(interp_pca_female_mRNA_DNAm), title = "Block mRNA & DNAm")

Figure : Contribution of variables for Comp 7 for mRNA and mRNA+DNAm. Remark :

  • If we focus on the Top 100 contributions, it is dominated by mRNA.
  • The components of interest explains very little percentage of the variance.

=> It might be interesting to look for methods that treat each omic as their own and allow to separately interpret the influence of each omics into a “signal” of interest.

weights_pca_female_mRNA_DNAm = fviz_contrib(pca_female_mRNA_DNAm, 
                                            choice = "var", axes = 7, 
                                            top = 100) + 
  labs(title =paste0("Block mRNA & DNAm - Contribution of variables to Dim-7", 
                     "\nVariance Explained : ", pca_female_mRNA_DNAm$eig[7, 2]))
weights_pca_female_mRNA = fviz_contrib(pca_female_mRNA, choice = "var", 
                                            axes = 7, top = 100) + 
  labs(title =paste0("Block mRNA - Contribution of variables to Dim-7", 
                     "\nVariance Explained : ", pca_female_mRNA$eig[7, 2]))
ggarrange(weights_pca_female_mRNA, weights_pca_female_mRNA_DNAm,
                  ncol = 1)

1.5 Another eye on PCA

TODO : Slides to present the criterion of PCA and interpret it in term of variance of the projected cloud of points (if possible, use the code form TD 12 on PCA to interactively see what the PCA solution represent).

2. Unsupervised Two-blocks analysis with Partial Least Squares (PLS) and Canonical Correlation Analysis (CCA)

2.1 Presentation of PLS and CCA

Continue with slides 6 (only with 2 bloks for now), 9 et 10 (without R-CCA each time) in presentation “Psy_Stats”.

TODO : I tried to redo the example of slide 10 mentioned above but it is really not working well. I will do it theoretically again.

# corrplot.mixed(cor(cbind(data.train.female$mRNA[, c(1512, 1708)], 
#                              "cg20228636" = data.train.female$DNAm[, 1744])), number.digits = 2)
# 
# test_CCA = rgcca(blocks = list(mRNA = data.train.female$mRNA[, c(1512, 1708)], 
#                                cg20228636 = data.train.female$DNAm[, 1744]), 
#                  method = "CCA", verbose = F, ncomp = 1)
# 
# test_PLS = rgcca(blocks = list(mRNA = data.train.female$mRNA[, c(1512, 1708)], 
#                                cg20228636 = data.train.female$DNAm[, 1744]), 
#                  method = "PLS", verbose = F, ncomp = 1)
# 
# test_PCA = PCA(data.train.female$mRNA[, c(1512, 1708)], 
#                scale.unit=F, ncp=1, graph=F)
# 
# df = data.frame(test_PLS $blocks$mRNA,
#                 test_PLS$blocks$cg20228636)
# 
# ggplot(df, aes(ENSG00000204644, ENSG00000230795)) +
#   geom_point(aes(colour = cg20228636),size = 3) +
#   scale_colour_gradient2() + theme(aspect.ratio=1)+
#   geom_segment(aes(x=0, xend=-test_PLS$a$mRNA[1], 
#                    y=0, yend=-test_PLS$a$mRNA[2]), 
#                arrow = arrow(length = unit(0.5, "cm")), size = 1) +
#   geom_segment(aes(x=0, xend=-test_CCA$a$mRNA[1]/norm(test_CCA$a$mRNA, type = "2"), 
#                    y=0, yend=-test_CCA$a$mRNA[2]/norm(test_CCA$a$mRNA, type = "2")), 
#                arrow = arrow(length = unit(0.5, "cm")), size = 1) +
#   geom_segment(aes(x=0, xend=test_PCA$var$coord[1]/norm(test_PCA$var$coord, type = "2"), 
#                    y=0, yend=test_PCA$var$coord[2]/norm(test_PCA$var$coord, type = "2")), 
#                arrow = arrow(length = unit(0.5, "cm")), size = 1)
# 
# 
# 
# 
# 
# par(mfrow = c(1, 1))
# miRNA_group_female = apply(data.train.female$miRNA, 2, function(x) t.test(x = x[which(data.train.female$cli$Group == "Patient")], y = x[which(data.train$cli$Group == "Control")])$p.value)
# 
# mRNA_group_female = apply(data.train.female$mRNA, 2, function(x) t.test(x = x[which(data.train.female$cli$Group == "Patient")], y = x[which(data.train$cli$Group == "Control")])$p.value)
# 
# 
# test_CCA = rgcca(blocks = list(mRNA = data.train.female$mRNA[, c(names(sort(mRNA_group_female, decreasing = F))[1:1], names(sort(mRNA_group_female, decreasing = T))[1:1])], miRNA = data.train$miRNA[, names(sort(miRNA_group_female, decreasing = F))[1:1]]), method = "CCA", verbose = F, ncomp = 1)
# 
# test_PLS = rgcca(blocks = list(mRNA = data.train.female$mRNA[, c(names(sort(mRNA_group_female, decreasing = F))[1:1], names(sort(mRNA_group_female, decreasing = T))[1:1])], miRNA = data.train.female$miRNA[, names(sort(miRNA_group_female, decreasing = F))[1:1]]), method = "PLS", verbose = F, ncomp = 1)
# 
# plot(test_CCA$blocks$mRNA, col = as.factor(data.train$cli$Group), pch = 15, asp = 1)
# lines(c(0, test_PLS$a$mRNA[1]), c(0, test_PLS$a$mRNA[2]))
# lines(c(0, test_CCA$a$mRNA[1]/norm(x = test_CCA$a$mRNA, type = "2")), 
#       c(0, test_CCA$a$mRNA[2]/norm(x = test_CCA$a$mRNA, type = "2")))
# 
# 
# 
# 
# 
# "ENSG00000153234"
# "ENSG00000100288"
# "ENSG00000204644"

2.2 Application on the MDD dataset

So Now, lets us apply PLS and CCA that we have just learnt about to the MDD data-set.

Figure : Interpretation of the 10 components for :

  • mRNA components associated with PLS
  • DNAm components associated with PLS
  • mRNA components associated with CCA
  • DNAm components associated with CCA

The interpretation of the components for CCA seems really alike, components must be really correlated.

set.seed(27) #favorite number
resampling = caret::createDataPartition(data.train.female$cli$Group, list = TRUE, p = 0.2)$Resample1
data.validation.female = lapply(data.train.female, function(x) x[-resampling, ])
data.test.female = lapply(data.train.female, function(x) x[resampling, ])


PLS_female_mRNA_DNAm = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                           DNAm = data.train.female$DNAm), 
                             connection = matrix(1, 2, 2) - diag(2), 
                             tau = c(1, 1), 
                             scheme = "horst", verbose = F, ncomp = 10)

CCA_female_mRNA_DNAm = rgcca(blocks = list(mRNA = data.validation.female$mRNA, 
                                           DNAm = data.validation.female$DNAm), 
                             connection = matrix(1, 2, 2) - diag(2), 
                             tau = c(0, 0), 
                             scheme = "horst", verbose = F, ncomp = 10)


interp_PLS_female_mRNA = apply(PLS_female_mRNA_DNAm$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_PLS_female_DNAm = apply(PLS_female_mRNA_DNAm$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_CCA_female_mRNA = apply(CCA_female_mRNA_DNAm$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female[-resampling, ], 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_CCA_female_DNAm = apply(CCA_female_mRNA_DNAm$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female[-resampling, ], 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

par(mfrow = c(2, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_PLS_female_mRNA), title = "PLS - Block mRNA")
drawheatmap(t(interp_PLS_female_DNAm), title = "PLS - Block DNAm")
drawheatmap(t(interp_CCA_female_mRNA), title = "CCA - Block mRNA")
drawheatmap(t(interp_CCA_female_DNAm), title = "CCA - Block DNAm")

2.3 The need to regularize

Figure : Correlation between components for CCA between the DNAm and mRNA block for a train set and a test set. On the train set component are perfectly pairwise correlated (and uncorrelated to their unmatching components), though this is not the case on the test set at all

=> This is explained by overfitting (TODO Slide to explain why overfitting)

# a = plot(CCA_female_mRNA_DNAm, type = "samples", comp = 1, bloc = 1, cex = 1.5)
# b = plot(CCA_female_mRNA_DNAm, type = "samples", comp = 2, bloc = 1, cex = 1.5)
# c = plot(CCA_female_mRNA_DNAm, type = "samples", comp = 3, bloc = 1, cex = 1.5)
# d = plot(CCA_female_mRNA_DNAm, type = "samples", comp = 4, bloc = 1, cex = 1.5)
# ggarrange(a, b, c, d , ncol = 2, nrow = 2)



CCA_female_mRNA_DNAm_validation = rgcca_transform(rgcca_res = CCA_female_mRNA_DNAm, 
                                                  blocks_test = list(mRNA = data.test.female$mRNA, 
                                                                     DNAm = data.test.female$DNAm))
colnames(CCA_female_mRNA_DNAm_validation$mRNA) = 
  colnames(CCA_female_mRNA_DNAm$Y$mRNA) = 
  paste0("mRNA - comp", 1:dim(CCA_female_mRNA_DNAm_validation$mRNA)[2])
colnames(CCA_female_mRNA_DNAm_validation$DNAm) = 
  colnames(CCA_female_mRNA_DNAm$Y$DNAm) = 
  paste0("DNAm - comp", 1:dim(CCA_female_mRNA_DNAm_validation$mRNA)[2])

par(mfrow = c(1, 2), omi = c(0, 0, 0, 0))
corrplot(Reduce(cor, CCA_female_mRNA_DNAm$Y), method = "number", main = "\n\n\n\nTrain Set")
corrplot(Reduce(cor, CCA_female_mRNA_DNAm_validation), method = "number",  main = "\n\n\n\nTest Set")

2.4 Presentation of Regularized CCA (R-CCA)

=> Introduce R-CCA first by completing slides 9 and 10 of presentation “Psy_STat”.

=> Now let us apply R-CCA to the data-set, with an arbitrary low value of tau (0.1)

2.5 Comparison of PLS and R-CCA on the MDD data-set

Figure : Interpretation of the 10 components for :

  • mRNA components associated with PLS
  • DNAm components associated with PLS
  • mRNA components associated with R-CCA
  • DNAm components associated with R-CCA

=> Differences ? -> As expected, R-CCA leads to components bearing the same meaning; whereas for PLS we have informations specific to each bloc (this is light, but differences are essentially in components 4/5/6)

=> GOOD NEWS : the DNAm now seems to bear a component that explains also the Patient/Control effect. Furthermore, this component seems to be more specific to the group effect than before (linked to only 2 other covariates, whereas before it was 3). Though it is still located at the 7 position (see next figure to see the corresponding % of variance explained).

CCA_female_mRNA_DNAm_regu = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                                DNAm = data.train.female$DNAm), 
                                  connection = matrix(1, 2, 2) - diag(2), 
                                  tau = c(0.1, 0.1), 
                                  scheme = "horst", verbose = F, ncomp = 10)

interp_CCA_female_mRNA_regu = apply(CCA_female_mRNA_DNAm_regu$Y$mRNA, 2,
                                    function(x) 
                                      sapply(DNAm_covariates_explored_female, 
                                             function(y) 
                                               check_association_CHAMP(x, y)))
interp_CCA_female_DNAm_regu = apply(CCA_female_mRNA_DNAm_regu$Y$DNAm, 2,
                                    function(x) 
                                      sapply(DNAm_covariates_explored_female, 
                                             function(y) 
                                               check_association_CHAMP(x, y)))
par(mfrow = c(2, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_PLS_female_mRNA), title = "PLS - Block mRNA")
drawheatmap(t(interp_PLS_female_DNAm), title = "PLS - Block DNAm")
drawheatmap(t(interp_CCA_female_mRNA_regu), title = "R-CCA - Block mRNA")
drawheatmap(t(interp_CCA_female_DNAm_regu), title = "R-CCA - Block DNAm")

Figure : Let us look closer at what is inside component 7. For both methods, components are quite correlated and indeed seems do be catching a group effect. Then the other figures help interpreting what was used to compute these components (here I displayed only the 10 first variables for the sake of space). Then it is possible to try going deeper into the interpretation. For example “ENSG00000153234” is interesting (TODO : screenshot of ensembl page). This gene was already present in top genes of PCA.

==> But How can we add the miRNA block now ?

a = plot(PLS_female_mRNA_DNAm, type = "samples", response = data.train.female$cli$Group, cex = 1.5, comp = 7) + ylim(c(-1, 1)) + xlim(c(-1.5, 1.5)) + ggtitle("PLS\nSample space")
b = plot(CCA_female_mRNA_DNAm_regu, type = "samples", response = data.train.female$cli$Group, cex = 1.5, comp = 7)+ ylim(c(-1, 1))+ xlim(c(-1.5, 1.5)) + ggtitle("CCA\nSample space")
c = plot(PLS_female_mRNA_DNAm, type = "weights", comp = 7, bloc = 1, cex = 1.5, n = 10)
d = plot(CCA_female_mRNA_DNAm, type = "weights", comp = 7, bloc = 1, cex = 1.5, n = 10)
e = plot(PLS_female_mRNA_DNAm, type = "weights", comp = 7, bloc = 2, cex = 1.5, n = 10)
f = plot(CCA_female_mRNA_DNAm, type = "weights", comp = 7, bloc = 2, cex = 1.5, n = 10)
ggarrange(a, b, c, d, e, f, ncol = 2, nrow = 3)

3. Unsupervised analysis of an arbitrary number of blocks with Regularized Generalized Canonical Correlation Analysis (RGCCA)

3.1 Introducing RGCCA :

Slide 4 (with all the animations) of “DEFENSE_V2_bis” and maybe add slide 11 of “Psy_Stat” to have a different interpretation of tau. Then add slides 7 of “Psy_Stat” to see all methods encompass by RGCCA.

3.2 Application of RGCCA on the MDD dataset :

3.2.1 The impact of tau

Now let us apply RGCCA to our data-set. In order to be as close as possible to the methods above, let us apply it when :

  • All tau = 0.1 (close to CCA) or 1 (PLS)
  • Function g = identity (horst scheme)
  • All blocks are connected (a.k.a. Complete design)

Figure : Differences : once again RGCCA tau = 0.1 seems to have components describing the same information across all blocks. This is particularly true for the miRNA blocks that seems to have specific information when tau=1 (located essentially first components and first covariates).

=> GOOD NEWS : The components mainly linked to the group effect has raised in position (3 for tau=1 and 4 and 9 for tau = 0.1). This may be interpreted as the fact that this group effect is indeed common to each block and as such, is sooner/better catch by RGCCA. This has to be mitigated by the fact that a quite strong link to “MO” is also catch by this component in the mRNA block. Furthermore see next figure for the explained variance.

RGCCA_PLS_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(1, 3, 3) - diag(3), 
                    tau = c(1,1,1), scheme = "horst", verbose = F, ncomp = 10)

interp_RGCCA_PLS_female_mRNA = apply(RGCCA_PLS_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_female_miRNA = apply(RGCCA_PLS_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_female_DNAm = apply(RGCCA_PLS_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

RGCCA_CCA_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(1, 3, 3) - diag(3), 
                    tau = c(0.1,0.1,0.1), scheme = "horst", verbose = F, ncomp = 10)

interp_RGCCA_CCA_female_mRNA = apply(RGCCA_CCA_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_CCA_female_miRNA = apply(RGCCA_CCA_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_CCA_female_DNAm = apply(RGCCA_CCA_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

par(mfrow = c(3, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_RGCCA_PLS_female_mRNA), title = "\nRGCCA - tau = 1\nBlock mRNA")
drawheatmap(t(interp_RGCCA_CCA_female_mRNA), title = "\nRGCCA - tau = 0.1\nBlock mRNA")
drawheatmap(t(interp_RGCCA_PLS_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_CCA_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_PLS_female_DNAm), title = "Block DNAm")
drawheatmap(t(interp_RGCCA_CCA_female_DNAm), title = "Block DNAm")

Figure : Though raised in position, this component of interest still seems to be explaining a low percentage of the total explained variance (TODO : Slide to explained the AVE in RGCCA).

a = plot(RGCCA_PLS_female, type = "ave", comp = 4) + ggtitle("RGCCA - tau = 1\nAverage Variance Explained")
b = plot(RGCCA_CCA_female, type = "ave", comp = 4) + ggtitle("RGCCA - tau = 0.1\nAverage Variance Explained")
ggarrange(a, b, nrow = 2, ncol  = 1)

3.2.2 The impact the connection matrix

What is the impact of the connection matrix. For example, as before all blocks were connected (a.k.a; complete design), it might seems a little too strong as an assumption. Each block might bear the group effect but not in the same way. For example, biologically, DNAm and miRNA directly impact the gene expression, but have lower interactions between one another. Let us try a second design matrix where all blocks are connected to the mRNA bock only (refered to as hierarchical design).

Figure : Interpretation of the components for each block in the hierarchical (left) or complete (right) design. Observations :

  • The component 3 is less clearly linked to the group effect for all blocks in the hierarchical design.
  • In the hierarchical design, block are less constrained to be linked (especially bloc miRNA and DNAm) and they start having specific information that i extracted (see first component of DNAm and comp6 of miRNA). This seems to have impacted the group effect in a bad way (less commonly retrieve across all blocks.)
RGCCA_PLS_H_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(c(0, 1, 1, 1, 0, 0, 1, 0, 0), ncol = 3, byrow = T), 
                    tau = c(1,1,1), scheme = "horst", verbose = F, ncomp = 10)

interp_RGCCA_PLS_H_female_mRNA = apply(RGCCA_PLS_H_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_H_female_miRNA = apply(RGCCA_PLS_H_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_H_female_DNAm = apply(RGCCA_PLS_H_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

RGCCA_PLS_C_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(1, 3, 3) - diag(3), 
                    tau = c(1,1,1), scheme = "horst", verbose = F, ncomp = 10)

interp_RGCCA_PLS_C_female_mRNA = apply(RGCCA_PLS_C_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_female_miRNA = apply(RGCCA_PLS_C_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_female_DNAm = apply(RGCCA_PLS_C_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

par(mfrow = c(3, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_RGCCA_PLS_H_female_mRNA), title = "\nRGCCA - tau = 1 - Hierarchical\nBlock mRNA")
drawheatmap(t(interp_RGCCA_PLS_C_female_mRNA), title = "\nRGCCA - tau = 1 - Complete\nBlock mRNA")
drawheatmap(t(interp_RGCCA_PLS_H_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_PLS_C_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_PLS_H_female_DNAm), title = "Block DNAm")
drawheatmap(t(interp_RGCCA_PLS_C_female_DNAm), title = "Block DNAm")

3.2.3 The impact the scheme function

Lets us compare two schemes, when the g function equals the identity (a.k.a. Horst scheme), this is the scheme used since the beginning, and when the g function equals the square function (a.k.a. factorial scheme). The particular effect of the Factorial scheme is that it allows to catch high effect that are positively or negatively correlated between blocks (Horst allows only positively). It also have the effect of decreasing low links (covariance below 1) and increasing higher ones (covariance above 1)

Figure : Interpretation of the components for each block with the Horst (left) or Factorial (right) scheme. Observations :

  • The component 3 is less clearly linked to the group effect for all blocks in the Factorial scheme.
  • There are less pinks squares (lowest level of significance) when factorial scheme (43 vs 56).
RGCCA_PLS_C_Horst_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(1, 3, 3) - diag(3), 
                    tau = c(1,1,1), scheme = "horst", verbose = F, ncomp = 10)

interp_RGCCA_PLS_C_Horst_female_mRNA = apply(RGCCA_PLS_C_Horst_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_Horst_female_miRNA = apply(RGCCA_PLS_C_Horst_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_Horst_female_DNAm = apply(RGCCA_PLS_C_Horst_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

RGCCA_PLS_C_Factorial_female = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
                                  miRNA = data.train.female$miRNA, 
                                  DNAm = data.train.female$DNAm), 
                    connection = matrix(1, 3, 3) - diag(3), 
                    tau = c(1,1,1), scheme = "factorial", verbose = F, ncomp = 10)

interp_RGCCA_PLS_C_Factorial_female_mRNA = apply(RGCCA_PLS_C_Factorial_female$Y$mRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_Factorial_female_miRNA = apply(RGCCA_PLS_C_Factorial_female$Y$miRNA, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))
interp_RGCCA_PLS_C_Factorial_female_DNAm = apply(RGCCA_PLS_C_Factorial_female$Y$DNAm, 2,
                               function(x) 
                                 sapply(DNAm_covariates_explored_female, 
                                        function(y) 
                                          check_association_CHAMP(x, y)))

par(mfrow = c(3, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_RGCCA_PLS_C_Horst_female_mRNA), title = "\nRGCCA - tau = 1 - Complete - Horst\nBlock mRNA")
drawheatmap(t(interp_RGCCA_PLS_C_Factorial_female_mRNA), title = "\nRGCCA - tau = 1 - Complete - Factorial\nBlock mRNA")
drawheatmap(t(interp_RGCCA_PLS_C_Horst_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_PLS_C_Factorial_female_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_PLS_C_Horst_female_DNAm), title = "Block DNAm")
drawheatmap(t(interp_RGCCA_PLS_C_Factorial_female_DNAm), title = "Block DNAm")

3.3 How is it possible to set the parameters in an unsupervised experiment ?

TODO : Slide to explain permutation (make slide close to slide 47 or 57 in spy_stat presentation)

Now, let us apply this permutation to our situation in order to set tau (the RGCCA package also allows to optimize for the number of components). We set the RGCCA model to the complete design, with the factorial scheme (Horst were giving better results but we believe in the necessity to retrieve both highly positively or negatively correlated events). We choose to explore 20 values in the interval [1e-4; 0.5], sampled in a logspace manner. For the sake of computational time, each block and each component are associated with the same value of tau (though one value of tau can be specified for each combination). Furthermore, for the same reason, we limit ourselves to 6 components, as it seems that the relevant information is catch in these components. 30 permutations are performed

Figure : Best set of parameters is

  mRNA      miRNA       DNAm 

0.05315656 0.05315656 0.05315656

# Not run
# perm_test_PLS_3B = rgcca_permutation(blocks = list(mRNA = data.train.female$mRNA, 
#                                                    miRNA = data.train.female$miRNA, 
#                                                    DNAm = data.train.female$DNAm), 
#                                      connection = matrix(c(0, 1, 1, 1, 0, 1, 1, 1, 0), ncol = 3, byrow = T), 
#                                      par_value = set_tau, 
#                                      n_perms = 30, n_cores = 20, ncomp = 6)


permutation_RGCCA_C_Factorial_female = readRDS("permutation_RGCCA_C_Factorial_female")
class(permutation_RGCCA_C_Factorial_female) = "permutation"

set_tau = sapply(data.train.female[1:3], function(x) pracma:::logseq(x1 = 1e-4, x2 = 0.5, n = 20))

plot(permutation_RGCCA_C_Factorial_female)

Now we train the model associated with the optimal set of parameters and then we performed bootstrap in order to assess the robustness of the block weight vectors allowing for the computation of the block component vectors.

SLIDE : add slides explaining bootstrap (use slides 47 in the presentation “Psy_Stat” + TODO : slides explaining how p-values and confidence intervals are computed ?)

Figure : 2000 bootstrap samples were generated. We are looking at the first 100 weights (estimated on the regular cohort, wihout resampling) with their estimated Confidence Interval for components 1/3/4 (the last two because it is were the information was)

==> nothing is significant at all.

# NOT RUN
# RGCCA_C_Factorial_female_bestPerm = rgcca(blocks = list(mRNA = data.train.female$mRNA, 
#                                                         miRNA = data.train.female$miRNA, 
#                                                         DNAm = data.train.female$DNAm), 
#                                           connection = matrix(c(0, 1, 1, 1, 0, 1, 1, 1, 0), ncol = 3, byrow = T), 
#                                           tau = permutation_RGCCA_C_Factorial_female$best_params, scheme = "factorial", 
#                                           verbose = F, ncomp = 6)
# 
# boot_test_PLS_3B = rgcca_bootstrap(RGCCA_C_Factorial_female_bestPerm , n_cores = 20, n_boot = 2000)

bootstrap_RGCCA_C_Factorial_female_bestPerm = readRDS("bootstrap_RGCCA_C_Factorial_female_bestPerm")
class(bootstrap_RGCCA_C_Factorial_female_bestPerm) = "bootstrap"

a = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 1, comp = 1)
b = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 2, comp = 1)
c = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 3, comp = 1)
d = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 1, comp = 3)
e = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 2, comp = 3)
f = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 3, comp = 3)
g = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 1, comp = 4)
h = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 2, comp = 4)
i = plot(bootstrap_RGCCA_C_Factorial_female_bestPerm, n = 100, bloc = 3, comp = 4)
ggarrange(a, b, c, d, e, f, g, h, i, nrow = 3, ncol = 3)

==> What can we do to get out of this situation. Actually a lot of things :

  • I was really quick on the pre-processing and drop down all variables with NAs… actually, RGCCA can deal with them, it would be interesting to try with them
  • In the pre-processing steps, I kept only the first 2000 variables for DNAm and mRNA according to the Median Absolute Deviation (MAD; TODO, SLIDE to explain). At least I could have respected the ratio of variables across blocks (so take more variables for DNAm as it is originally composed of ~700.000 against ~16.000 for mRNA).
  • Parameters can be better better tuned. Here I forced that they are the same for all components and blocks.
  • Try some extensions of RGCCA that will be evoked at the end (especially Sparse GCCA to perform variable selection).
  • I Can remove batch effects and/or unwanted effects. Here we can see strong association with GR, Age, BMI… This is not taken into account
  • And… we can supervise the model

4. Supervised analysis of an arbitrary number of blocks with Regularized Generalized Canonical Correlation Analysis (RGCCA)

Here, our goal is to predict for Patient vs. Control, so we will supervise accordingly.

SLIDE : TODO add slides explaining how we can supervise with RGCCA. Distinguish the double supervision (both a supervising bloc and then a prediction model) over the single supervision (no supervising bloc but still prediction model based on the components).

In such situation, it is required to perform Cross-Validation in order to tune parameters to obtain the model with the best generalization capacity : cf. add slide 57 of “Psy_Stat” presentation.

So in the end, we cross-validated on the tau (for the same set of values as before). The prediction model used in the second step is a Linear Discriminant Analysis (LDA) model (TODO : Slide LDA ? At least in the general introduction on Statistics). TODO : Add slides to compare theoretically (and even computationally) DIABLO from the mixOmics package with RGCCA in this context. And then precise that mixOmics wrap RGCCA and that a lot of mixOmics functions are based on RGCCA (show screenshots).

We still select the complete design with the factorial scheme. Here, as we supervise, we are not forced to extract a high number of components and we can limit ourselves to 3 components per bloc (it is also less time consuming). Cross Validation was performed on the F1-score (TODO : Slide for Metric, in General Statistics ?) with 5 repetitions of a 5 folds CV. Furthermore, before performing the CV, we extracted 20% of the data-set as a test set to valiate our optimized model on the train set (optimized with CV). The resampling to separated train and test was performed with stratification (TODO : SLIDE for stratification ?).

Figure : the best set of extracted parameters is :

mRNA miRNA DNAm group

1e-04 1e-04 1e-04 0e+00

===> actually this setis on the borders of the grid.. Might need to explore more lower values.

resampling = readRDS(file = "resampling4CV")

data.validation.female = lapply(data.train.female, function(x) x[-resampling, ])
data.test.female = lapply(data.train.female, function(x) x[resampling, ])
  
# NOT RUN
# crossValidation_RGCCA_HiearchicalSupervised_Factorial_female <- rgcca_cv(blocks = list(mRNA = data.validation.female$mRNA, 
#                                                                                        miRNA = data.validation.female$miRNA, 
#                                                                                        DNAm = data.validation.female$DNAm, 
#                                                                                        group = data.validation.female$cli$Group),
#                                                                          response = 4, method = "rgcca",
#                                                                          par_type = "tau", par_value = cbind(set_tau, 0),
#                                                                          prediction_model = "lda", #caret::modelLookup()
#                                                                          metric = "F1",
#                                                                          k=5, n_run = 5,
#                                                                          verbose = TRUE, n_cores = 20, ncomp = c(3, 3, 3, 1))

crossValidation_RGCCA_HiearchicalSupervised_Factorial_female = readRDS("crossValidation_RGCCA_HiearchicalSupervised_Factorial_female.RDS")
class(crossValidation_RGCCA_HiearchicalSupervised_Factorial_female) = "cval"

plot(crossValidation_RGCCA_HiearchicalSupervised_Factorial_female)

The Best model is learnt and then applied on the test set. The F1 score on the test set is rather high, “0.7272727”, though, overfitting may still be present as the F1 score on the train set is “1”.

RGCCA_HiearchicalSupervised_Factorial_female_bestCV = rgcca(crossValidation_RGCCA_HiearchicalSupervised_Factorial_female)

predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV =
  rgcca_predict(rgcca_res = RGCCA_HiearchicalSupervised_Factorial_female_bestCV, 
                blocks_test = list(mRNA = data.test.female$mRNA, 
                                   miRNA = data.test.female$miRNA, 
                                   DNAm = data.test.female$DNAm, 
                                   group = data.test.female$cli$Group), 
                prediction_model = "lda", metric = "F1")
print(predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$metric)
## $train
##                       group
## Accuracy          1.0000000
## Kappa             1.0000000
## F1                1.0000000
## Sensitivity       1.0000000
## Specificity       1.0000000
## Pos_Pred_Value    1.0000000
## Neg_Pred_Value    1.0000000
## Precision         1.0000000
## Recall            1.0000000
## Detection_Rate    0.5797101
## Balanced_Accuracy 1.0000000
## 
## $test
##                       group
## Accuracy          0.6666667
## Kappa             0.3076923
## F1                0.7272727
## Sensitivity       0.8000000
## Specificity       0.5000000
## Pos_Pred_Value    0.6666667
## Neg_Pred_Value    0.6666667
## Precision         0.6666667
## Recall            0.8000000
## Detection_Rate    0.4444444
## Balanced_Accuracy 0.6500000
# pairs(Reduce(cbind, predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$projection), col = as.factor(data.test.female$cli$Group), pch = 15)

Figure : interpretation of the components for each bloc and on TRAIN or TEST sets. As expected by the supervision effect, the first component is highly linked to the group effect and this is the only effect explained. On the TEST set, this illustrates the overfitting. Though, as expected by the fact that the F1 score was high on the test set, there are still some relevant information extracted from the TRAIN set. The learnt model still seems to bear a certain generalization power (not as strong as expected on the TRAIN set obviously). Especially for the mRNA bloc, a little for the miRNA block and not at all for the DNAm block.

interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_mRNA = 
  apply(RGCCA_HiearchicalSupervised_Factorial_female_bestCV$Y$mRNA, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[-resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))
interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_miRNA =
  apply(RGCCA_HiearchicalSupervised_Factorial_female_bestCV$Y$miRNA, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[-resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))
interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_DNAm =
  apply(RGCCA_HiearchicalSupervised_Factorial_female_bestCV$Y$DNAm, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[-resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))
interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_mRNA =
  apply(predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$projection$mRNA, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))
interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_miRNA =
  apply(predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$projection$miRNA, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))
interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_DNAm =
  apply(predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$projection$DNAm, 2,
        function(x) 
          sapply(DNAm_covariates_explored_female[resampling, ], 
                 function(y) 
                   check_association_CHAMP(x, y)))

par(mfrow = c(3, 2), omi = c(0, 0.7, 0, 0))
drawheatmap(t(interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_mRNA), title = "\nTRAIN SET _ RGCCA - tau Opt CV - Complete - Factorial\nBlock mRNA")
drawheatmap(t(interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_mRNA), title = "\nTEST SET _ RGCCA - tau Opt CV - Complete - Factorial\nBlock mRNA")
drawheatmap(t(interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_miRNA), title = "Block miRNA")
drawheatmap(t(interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_miRNA), title = "Block miRNA")
drawheatmap(t(interp_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_DNAm), title = "Block DNAm")
drawheatmap(t(interp_predictOnTest_RGCCA_HiearchicalSupervised_Factorial_female_bestCV_DNAm), title = "Block DNAm")

We can still perform a bootstrap procedure in order to evaluate the robustness of the weight and identify stable ones.

bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV = readRDS("bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV")
class(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV) = "bootstrap"


a = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 1, comp = 1)
b = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 2, comp = 1)
c = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 3, comp = 1)
d = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 1, comp = 2)
e = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 2, comp = 2)
f = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 3, comp = 2)
g = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 1, comp = 3)
h = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 2, comp = 3)
i = plot(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV, n = 100, bloc = 3, comp = 3)
ggarrange(a, b, c, d, e, f, g, h, i, nrow = 3, ncol = 3)

Bellow are the significant variables for each bock and each component. We can now try to perform enrichment analysis on each list associated with each component.

weights_idx = which(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$type == "weights")

print("#################   Component 1   #################")
## [1] "#################   Component 1   #################"
bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$var[which((p.adjust(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$pval[weights_idx], method = "fdr")<=0.05) & (bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$comp == 1))]
##   [1] "ENSG00000002726" "ENSG00000009790" "ENSG00000013441" "ENSG00000018280"
##   [5] "ENSG00000072952" "ENSG00000073756" "ENSG00000092094" "ENSG00000092871"
##   [9] "ENSG00000100288" "ENSG00000101236" "ENSG00000102908" "ENSG00000103502"
##  [13] "ENSG00000104960" "ENSG00000110344" "ENSG00000115523" "ENSG00000117281"
##  [17] "ENSG00000118960" "ENSG00000120709" "ENSG00000122435" "ENSG00000122642"
##  [21] "ENSG00000123689" "ENSG00000123700" "ENSG00000125347" "ENSG00000126746"
##  [25] "ENSG00000131368" "ENSG00000132155" "ENSG00000132522" "ENSG00000132825"
##  [29] "ENSG00000135976" "ENSG00000137947" "ENSG00000139631" "ENSG00000140396"
##  [33] "ENSG00000140526" "ENSG00000142396" "ENSG00000144655" "ENSG00000147813"
##  [37] "ENSG00000150045" "ENSG00000152926" "ENSG00000160321" "ENSG00000161960"
##  [41] "ENSG00000162413" "ENSG00000162517" "ENSG00000163154" "ENSG00000164620"
##  [45] "ENSG00000165548" "ENSG00000165792" "ENSG00000168970" "ENSG00000169403"
##  [49] "ENSG00000173068" "ENSG00000173114" "ENSG00000173334" "ENSG00000173825"
##  [53] "ENSG00000174837" "ENSG00000175538" "ENSG00000175793" "ENSG00000178951"
##  [57] "ENSG00000179388" "ENSG00000179933" "ENSG00000180879" "ENSG00000182108"
##  [61] "ENSG00000184602" "ENSG00000186130" "ENSG00000186594" "ENSG00000187037"
##  [65] "ENSG00000188186" "ENSG00000196313" "ENSG00000197530" "ENSG00000198695"
##  [69] "ENSG00000198712" "ENSG00000198763" "ENSG00000198888" "ENSG00000210082"
##  [73] "ENSG00000211459" "ENSG00000213190" "ENSG00000213462" "ENSG00000213694"
##  [77] "ENSG00000213983" "ENSG00000214655" "ENSG00000220804" "ENSG00000223551"
##  [81] "ENSG00000225972" "ENSG00000228253" "ENSG00000233038" "ENSG00000240225"
##  [85] "ENSG00000242125" "ENSG00000248323" "ENSG00000248527" "ENSG00000250510"
##  [89] "ENSG00000257878" "ENSG00000258682" "ENSG00000260007" "ENSG00000269352"
##  [93] "ENSG00000269688" "ENSG00000271254" "ENSG00000274008" "ENSG00000274213"
##  [97] "ENSG00000274536" "hsa-let-7d-3p"   "hsa-miR-142-3p"  "hsa-miR-16-2-3p"
## [101] "hsa-miR-19a-3p"  "hsa-miR-222-3p"  "hsa-miR-223-5p"  "hsa-miR-374b-5p"
## [105] "hsa-miR-4286"    "cg24984195"      "cg22992730"      "cg13375589"     
## [109] "cg13462158"      "cg18727742"      "cg23480021"      "cg22117893"     
## [113] "cg05946535"      "cg11203771"      "cg05745656"      "cg08200543"     
## [117] "cg07709590"      "cg16301894"      "cg21827317"      "cg13824270"     
## [121] "cg10833299"      "cg07275437"      "cg27572370"      "cg10751070"     
## [125] "cg11480278"      "cg27088726"      "cg08506353"      "cg19978674"     
## [129] "cg13831575"      "cg04414766"      "cg14768206"      "cg15019001"     
## [133] "cg07414487"      "cg22509179"      "cg23900144"      "cg17134397"     
## [137] "cg17828921"      "cg13132497"      "cg06068545"      "cg02920129"     
## [141] "cg26610739"      "cg18891815"      "cg17667591"      "cg06955432"     
## [145] "Patient"         "ENSG00000002726" "ENSG00000009790" "ENSG00000013441"
## [149] "ENSG00000018280" "ENSG00000072952" "ENSG00000073756" "ENSG00000092094"
## [153] "ENSG00000092871" "ENSG00000100288" "ENSG00000101236" "ENSG00000102908"
## [157] "ENSG00000103502" "ENSG00000104960" "ENSG00000110344" "ENSG00000115523"
## [161] "ENSG00000117281" "ENSG00000118960" "ENSG00000120709" "ENSG00000122435"
## [165] "ENSG00000122642" "ENSG00000123689" "ENSG00000123700" "ENSG00000125347"
## [169] "ENSG00000126746" "ENSG00000131368" "ENSG00000132155" "ENSG00000132522"
## [173] "ENSG00000132825" "ENSG00000135976" "ENSG00000137947" "ENSG00000139631"
## [177] "ENSG00000140396" "ENSG00000140526" "ENSG00000142396" "ENSG00000144655"
## [181] "ENSG00000147813" "ENSG00000150045" "ENSG00000152926" "ENSG00000160321"
## [185] "ENSG00000161960" "ENSG00000162413" "ENSG00000162517" "ENSG00000163154"
## [189] "ENSG00000164620" "ENSG00000165548" "ENSG00000165792" "ENSG00000168970"
## [193] "ENSG00000169403" "ENSG00000173068" "ENSG00000173114" "ENSG00000173334"
## [197] "ENSG00000173825" "ENSG00000174837" "ENSG00000175538" "ENSG00000175793"
## [201] "ENSG00000178951" "ENSG00000179388" "ENSG00000179933" "ENSG00000180879"
## [205] "ENSG00000182108" "ENSG00000184602" "ENSG00000186130" "ENSG00000186594"
## [209] "ENSG00000187037" "ENSG00000188186" "ENSG00000196313" "ENSG00000197530"
## [213] "ENSG00000198695" "ENSG00000198712" "ENSG00000198763" "ENSG00000198888"
## [217] "ENSG00000210082" "ENSG00000211459" "ENSG00000213190" "ENSG00000213462"
## [221] "ENSG00000213694" "ENSG00000213983" "ENSG00000214655" "ENSG00000220804"
## [225] "ENSG00000223551" "ENSG00000225972" "ENSG00000228253" "ENSG00000233038"
## [229] "ENSG00000240225" "ENSG00000242125" "ENSG00000248323" "ENSG00000248527"
## [233] "ENSG00000250510" "ENSG00000257878" "ENSG00000258682" "ENSG00000260007"
## [237] "ENSG00000269352" "ENSG00000269688" "ENSG00000271254" "ENSG00000274008"
## [241] "ENSG00000274213" "ENSG00000274536" "hsa-let-7d-3p"   "hsa-miR-142-3p" 
## [245] "hsa-miR-16-2-3p" "hsa-miR-19a-3p"  "hsa-miR-222-3p"  "hsa-miR-223-5p" 
## [249] "hsa-miR-374b-5p" "hsa-miR-4286"    "cg24984195"      "cg22992730"     
## [253] "cg13375589"      "cg13462158"      "cg18727742"      "cg23480021"     
## [257] "cg22117893"      "cg05946535"      "cg11203771"      "cg05745656"     
## [261] "cg08200543"      "cg07709590"      "cg16301894"      "cg21827317"     
## [265] "cg13824270"      "cg10833299"      "cg07275437"      "cg27572370"     
## [269] "cg10751070"      "cg11480278"      "cg27088726"      "cg08506353"     
## [273] "cg19978674"      "cg13831575"      "cg04414766"      "cg14768206"     
## [277] "cg15019001"      "cg07414487"      "cg22509179"      "cg23900144"     
## [281] "cg17134397"      "cg17828921"      "cg13132497"      "cg06068545"     
## [285] "cg02920129"      "cg26610739"      "cg18891815"      "cg17667591"     
## [289] "cg06955432"      "Patient"
print("#################   Component 2   #################")
## [1] "#################   Component 2   #################"
bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$var[which((p.adjust(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$pval[weights_idx], method = "fdr")<=0.05) & (bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$comp == 2))]
## [1] "ENSG00000161960" "ENSG00000163154" "Patient"         "ENSG00000161960"
## [5] "ENSG00000163154" "Patient"
print("#################   Component 3   #################")
## [1] "#################   Component 3   #################"
bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$var[which((p.adjust(bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$pval[weights_idx], method = "fdr")<=0.05) & (bootstrap_RGCCA_HiearchicalSupervised_Factorial_female_bestCV$stats$comp == 3))]
## [1] "Patient" "Patient"

5. Going futher with RGCCA

Introduce extension of RGCCA (TODO : SLIDED for each) :

  • SGCCA and more structured variable selection processes ?
  • MGCCA/TGCCA
  • Multi-group RGCCA ?
  • FGCCA ?
  • KGCCA ?

If I have to spend time on one of them, it is definitely SGCCA.

6. MOFA

Slides to present theory behind MOFA.

library(data.table)#
library(ggplot2)#
library(gage)#
## 
library(MOFA2)#
## 
## Attaching package: 'MOFA2'
## The following object is masked from 'package:stats':
## 
##     predict
# library(MOFAdata)#
library(DT)#
## Warning: package 'DT' was built under R version 4.1.3
library(msigdbr)#
## Warning: package 'msigdbr' was built under R version 4.1.3

Create MOFA object

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Create the MOFA object.                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

#select only women
selectedPatients=data.train$cli$Name[which(data.train$cli$Sex=="FALSE")]

multiview.norm.mat=list()
multiview.norm.mat$clinical=NULL
multiview.norm.mat$methylation=t(data.train$DNAm[selectedPatients,])
multiview.norm.mat$mirna=t(data.train$miRNA[selectedPatients,])
multiview.norm.mat$mrna=t(data.train$mRNA[selectedPatients,])

#they will be renamed
MOFAobject <- create_mofa(multiview.norm.mat)


#Plot overview of the input data, including the number of samples, views, features, and the missing assays.
plot_data_overview(MOFAobject) # everything is matching here

MOFA options

First, we have to define the options, they are classified in 3 categories :

  • Data options
  • Model options
  • Training options

Data options

Data options, important arguments:

  • scale_views: logical indicating whether to scale views to have the same unit variance. As long as the scale differences between the views is not too high, this is not required. Default is FALSE.
  • scale_groups: logical indicating whether to scale groups to have the same unit variance. As long as the scale differences between the groups is not too high, this is not required. Default is FALSE. (Groups are introduced in MOFA+)

You can also change the name of views and groups through the ‘data_options’ field of the MOFA object.

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                                 MOFA options                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Data options
###################################
data_opts <- get_default_data_options(MOFAobject)
# data_opts$scale_views <- TRUE
data_opts
## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] FALSE
## 
## $views
## [1] "methylation" "mirna"       "mrna"       
## 
## $groups
## [1] "group1"

Model options

Model options, important arguments:

  • likelihoods: type of data per view (options are “gaussian”, “poisson”, “bernoulli”). ‘gaussian’ is for continuous data, ‘bernoulli’ for binary data and ‘poisson’ for count/discrete data. By default they are inferred automatically… normally.
  • num_factors: number of factors

You can also specified through ‘model_options’ the use of sparsity priors on the weight and factor matrices (W and H). And which type of prior to use (ARD, spike-and-slab, or both). In the original MOFA approach, sparsity is only applied on W (ARD + spike-and-slab). In MOFA+ sparsity prior is also applied to the factor matrix H considering several groups. Corresponding fields are : spikeslab_factors, spikeslab_weights, ard_factors and ard_weights.

\(~\)

The model flexibly handles different data types; it is important to choose the likelihoods corresponding to the data (you can visually check data distribution).

For the number of factors, you can start with default value ~10 and further use dedicated packages such as ButchR method to define this number based on statistical criterion. If this number is too high, MOFA will remove unnecessary factors.

## Model options
###################################
model_opts <- get_default_model_options(MOFAobject)
model_opts$spikeslab_weights <- TRUE
model_opts$num_factors <- 10  
model_opts
## $likelihoods
## methylation       mirna        mrna 
##  "gaussian"  "gaussian"  "gaussian" 
## 
## $num_factors
## [1] 10
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] TRUE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

Visual inspection of mRNA data

hist(MOFAobject@data$mrna$group1, breaks = 100, main = "Histogram of mRNA values stored in MOFA object")

Visual inspection of miRNA data

hist(MOFAobject@data$mirna$group1 , breaks = 100, main = "Histogram of miRNA values stored in MOFA object")

Visual inspection of methylation data

hist(MOFAobject@data$methylation$group1 , breaks = 100, main = "Histogram of methylation values stored in MOFA object")

Open question: Should we model all these data with a gaussian distribution?

to change this option, you can use model_opts$likelihoods <- “gaussian” or “poisson” or “bernoulli”

#for example
#model_opts$likelihoods["methylation"] <- "bernoulli"

Training options

Training important options:

  • maxiter: maximum number of iterations (default is 1000).

  • convergence_mode: “fast”, “medium”, “slow”. For exploration, the fast mode is good enough.

  • startELBO: initial iteration to compute the ELBO (the objective function used to assess convergence)

  • freqELBO: frequency of computations of the ELBO (the objective function used to assess convergence)

  • drop_factor_threshold: minimum fraction of explained variance criteria to drop factors while training (eg 0.01 for 1%). Default is -1 (no dropping of factors). One can change it to discard factors that explain a low percentage of variance in all views.

  • stochastic: logical indicating whether to use stochastic variational inference (only required for very big data sets, introduced in MOFA+, default is FALSE).

  • seed : numeric indicating the seed for reproducibility (default is 42).

## Training options
###################################
train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "slow"
train_opts$freqELBO <- 1
train_opts$seed <- 42
#train_opts$drop_factor_threshold <- 0.001

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "slow"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 1
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

Prepare the object and run MOFA

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Prepare the MOFA object                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
## Checking data options...
## Checking training options...
## Checking model options...
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                                  Run MOFA                                  ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

MOFAobject <- run_mofa(MOFAobject, outfile="MOFA2_train.hdf5", use_basilisk = TRUE) 
## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

You can load a precomputed MOFA model with the load_Model() function with an hdf5 file.

Results interpretation

Get familiar with the MOFA object

Slots of MOFA object

MOFAobject is a S4 class used to store all relevant data to analyse a MOFA model. Get available slots

slotNames(MOFAobject)
##  [1] "data"               "covariates"         "covariates_warped" 
##  [4] "intercepts"         "imputed_data"       "interpolated_Z"    
##  [7] "samples_metadata"   "features_metadata"  "expectations"      
## [10] "training_stats"     "data_options"       "model_options"     
## [13] "training_options"   "stochastic_options" "mefisto_options"   
## [16] "dimensions"         "on_disk"            "dim_red"           
## [19] "cache"              "status"

Object slots are accessible using @ or $

  • data : input data (either a list of matrices or a MultiAssayExperiment)
  • intercepts : feature intercepts
  • imputed_data: imputed data (filled by running impute on the object, which is not necessary)
  • samples_ and features_metadata (basically groups)
  • expectations: expectations of the factors and weights ($Z and $W matrices)
  • training_stats: statistics about iterative training process (time, elbo..)
  • data_options: data processing options (given as parameters)
  • model_options: model options (given as parameters)
  • training_options: training options (given as parameters)
  • stochastic_options: training options (given as parameters is stochastic option is TRUE in training_options)
  • dimensions : dimensions for the number of views (M), groups (G), samples (N), features per view (D) and factors (K)
  • dim_red : non-linear dimension reduction manifolds.
  • cache : variance explained by factors.
  • on_disk : logical indicating whether data is loaded from disk.
  • status : auxiliary variable indicating whether the model has been trained.
  • mefisto_options, covariates, covariates_warped and interpolated_Z are optional slot to store sample covariate only for training in MEFISTO

Functions to get infos from MOFA object

Some interesting functions :

  • factors_names(): to get or set factor names
  • features_names(): to get or set feature names
  • samples_names(): to get or set sample names
  • views_names(): to get or set view names
  • get_dimensions(): to get dimensions (number of samples, features, etc.)
  • get_factors(): to get model factors
  • get_weights(): to get model weights

Check some slots of the trained MOFA model

###data
views_names(MOFAobject)
## [1] "methylation" "mirna"       "mrna"
get_dimensions(MOFAobject)
## $M
## [1] 3
## 
## $G
## [1] 1
## 
## $N
## group1 
##     87 
## 
## $D
## methylation       mirna        mrna 
##        2000         350        2000 
## 
## $K
## [1] 10
#Add sample metadata to the model
#temp_metadata <- readRDS("pd_mdd_bin.RDS",row.names = 1)
temp_metadata <- DNAm_covariates
colnames(temp_metadata)[2]="sample" #one metadata column should contains IDs of samples matching with input data and labelled as "sample"
temp_metadata <- temp_metadata[which(temp_metadata[,2] %in% colnames(MOFAobject@data$mrna$group1)),]
which(temp_metadata[,2]=="302-1")#two sample with the same number but different name in men+women cohort
## integer(0)
#temp_metadata <- temp_metadata[-95,] 
samples_metadata(MOFAobject) <- temp_metadata

Results

To interpret these data it is important to have information about the samples. Go to read the publication associated to your dataset.

General factors plots

Factors correlation plot

Correlation plot between factors with plot_factor_cor function.

Normally, they should be uncorrelated, or maybe you choose too many factors.

#Check that the Factors are uncorrelated
plot_factor_cor(MOFAobject)

Variance decomposition

Percentage of variance explained by factors in each view with the plot_variance_explained function. This is a key plot of MOFA and should always be done before inspecting factors or weights.

The latent factors inferred by MOFA represent the underlying principal sources of variation among the samples. Some of these factors may be shared across multiple data types, while others may be specific to a particular kind of data.

#variance decomposition analysis. 
plot_variance_explained(MOFAobject, max_r2=35,)

#plot_variance_explained(MOFAobject, plot_total = T)[[1]]

MOFAobject@cache$variance_explained$r2_per_factor
## $group1
##           methylation        mirna         mrna
## Factor1  0.1394995161 33.394372491 2.113047e-02
## Factor2  2.5057390115  2.737712505 1.840377e+01
## Factor3  0.0838591183 15.443605069 6.825117e-04
## Factor4  0.0049152796 15.158359650 1.428415e-03
## Factor5  0.0004624237  0.005798050 8.317380e+00
## Factor6  0.0832164718  2.758264022 2.535820e+00
## Factor7  0.4450982615  1.599998590 2.604499e+00
## Factor8  0.0058012080  0.030637060 3.778463e+00
## Factor9  0.0055396113  0.265900394 2.603466e+00
## Factor10 0.0230249807  0.004996025 2.685707e+00

In our case, Factor 2 captures a source of variability that is present across all data modalities, but a stronger association with mRNA data. Factors 1, 3 and 4 captures a very strong source of variation for the miRNA data.

You can compare those factors with RGCCA factors.

We can ask whether the model provides a good fit to the data. To check it we can plot the total variance explained (using all factors) with the plot_variance_explained function. The resulting values will depend on the nature of the data set, the number of samples, the number of factors, etc…:

  • Noisy datasets will result in a lower amounts of variance explained (<10%).
  • Higher the number of samples –> smaller the total variance explained.
  • Higher the number of factors –> higher the total variance explained.

The objective is not to reach 100%, even with an infinity of factors.

Explaining ~50% for a view using a linear model is already good.

d <- plot_variance_explained(MOFAobject, plot_total = T)[[2]]
#d$data
d

We can also access the variance explained per sample when all modalities are gaussian, to detect potential outliers with the calculate_variance_explained_per_sample function:

calculate_variance_explained(MOFAobject)
## $r2_total
## $group1
## methylation       mirna        mrna 
##    3.289975   70.306141   40.748408 
## 
## attr(,"class")
## [1] "relistable" "list"      
## 
## $r2_per_factor
## $group1
##           methylation        mirna         mrna
## Factor1  0.1394995161 33.394372491 2.113047e-02
## Factor2  2.5057390115  2.737712505 1.840377e+01
## Factor3  0.0838591183 15.443605069 6.825117e-04
## Factor4  0.0049152796 15.158359650 1.428415e-03
## Factor5  0.0004624237  0.005798050 8.317380e+00
## Factor6  0.0832164718  2.758264022 2.535820e+00
## Factor7  0.4450982615  1.599998590 2.604499e+00
## Factor8  0.0058012080  0.030637060 3.778463e+00
## Factor9  0.0055396113  0.265900394 2.603466e+00
## Factor10 0.0230249807  0.004996025 2.685707e+00
## 
## attr(,"class")
## [1] "relistable" "list"
calculate_variance_explained_per_sample(MOFAobject) #only with pure gaussian data
## $group1
##        methylation    mirna      mrna
## 105-3  0.000000000 69.42750 43.586866
## 107-1  6.082260688 70.69286 64.299878
## 109-2  4.009105852 58.85311 57.280772
## 110-4 10.316811643 66.77114 54.822651
## 111-4  0.165164961 15.27997 24.263077
## 113-3  0.746769685 48.48013 34.596442
## 118-2  0.939409479 50.93722 28.158968
## 121-3  0.908264763 77.78939 25.031449
## 123-4  3.091177490 68.63494 34.339067
## 124-2  0.562469890 35.22631 10.109843
## 125-2  2.516706693 75.75467 63.909923
## 126-4  0.487163768 79.70004 27.421631
## 127-2  5.269539954 46.24659 65.700988
## 129-4  4.332322469 53.37074 54.354735
## 130-1  3.872323800 75.75750 34.508030
## 131-2  3.961298211 45.22690 34.611914
## 132-4  0.672909071 55.15523 19.321692
## 134-4  4.634839219 70.61624 48.341780
## 136-4  1.103606268 43.33006 13.542022
## 137-4  0.430223826 56.02700 37.731950
## 139-4  0.144666618 52.60802  8.270593
## 141-4  4.372023154 82.29141 55.784593
## 145-2  1.060062536 75.06392 27.635752
## 150-4  0.000000000 66.11884 16.461139
## 151-4  6.859803275 78.09029 53.817398
## 155-4  3.996200455 87.16845 49.064566
## 157-4  1.422392676 84.10173 23.612331
## 158-4 15.014989692 68.23492 68.485506
## 159-2  1.848253778 65.93051 25.894472
## 160-4  0.693908749 73.14194 38.637444
## 161-4  0.224284360 85.79594 11.732929
## 165-4  1.066341134 83.99496 27.753205
## 167-2  1.441721251 27.11136 20.787869
## 168-4  0.709062148 24.97993 25.523180
## 169-4  2.012774688 73.99683 22.038853
## 173-3  2.506309856 25.04218 28.274502
## 174-2  0.000000000 38.48213 31.856115
## 178-3  4.211922592 82.88588 42.468062
## 180-4  3.050653310 77.70967 20.088182
## 182-4  0.308080682 87.78684 17.582271
## 184-1  1.360769448 68.71827 32.023647
## 185-4  3.493486737 57.44256 59.979772
## 187-2 13.565079515 79.73943 49.652708
## 191-4  1.013556344 74.73462 19.243730
## 192-2  0.211038558 73.38002 19.989213
## 193-2  5.340589755 88.60635 49.448742
## 194-4  0.119068511 77.26706 14.232575
## 196-4  1.658198125 56.56136 20.647978
## 198-3  6.867069551 85.87106 69.867824
## 199-2  3.200651609 76.14114 29.126129
## 205-1  1.450401417 40.25927 14.051943
## 210-1  7.167690923 58.52035 47.789523
## 211-1  0.680885868 35.96873 37.874684
## 216-1  0.464983902 76.16365 40.098209
## 222-1  8.019741037 66.58689 36.421519
## 309-1  0.758199886 85.89374 32.368011
## 314-1  1.754770851 73.90401 24.476764
## 323-1 19.618211992 72.25224 80.121107
## 404-1  2.529194965 78.14623 49.356968
## 408-1  1.280103665 54.40949 21.357967
## 503-1  0.746972316 49.80356 29.378446
## 510-1  1.098663364 67.75228 39.604248
## 511-1  0.386361702 58.71451 24.712334
## 516-1  3.228844675 82.57462 27.827413
## 603-1  2.463951941 79.79579 51.760289
## 611-1  2.021164536 80.40138 53.023440
## 620-1  8.731018497 88.39281 55.208548
## 623-1  3.206475778 61.24600 15.211590
## 629-1 12.716282294 80.20380 59.849297
## 801-1  0.376187598 81.19874 35.520633
## 802-1  0.344416784 72.86143 14.838804
## 804-1  1.214965398 77.70185 48.776182
## 806-1  1.108473515 75.33789 24.012699
## 811-1  2.361082570 44.12828 24.653452
## 817-1  0.002824111 72.69746 31.449888
## 820-1  1.143846784 57.44147 11.059384
## 822-1  0.000000000 52.46188 18.113969
## 903-1  3.820115236 46.09005 12.237532
## 904-1  2.386672921 47.91846 20.445453
## 905-1  0.604496419 75.33136 40.727142
## 906-1  1.103014232 68.42628 37.459204
## 908-1  2.440728784 58.54571 25.397719
## 909-1  4.403291762 57.94359 50.001029
## 910-1  2.821589433 30.56670 43.339803
## 911-1  0.000000000 51.03180 19.001916
## 912-1  1.899597422 42.58764 20.140715
## 914-1 27.127059853 76.65977 76.548722

Summary plot of factors

Test the association between MOFA factors and some metadata with the correlate_factors_with_covariates function.

correlate_factors_with_covariates(MOFAobject, 
  covariates = c("CD4","CD8","MO","B","NK","GR","Slide","Array","Age","BMI","Sample_Group"),
  plot="log_pval"
)

Factor2 is significantly correlated with GR, CD4, CD8, B and NK. Sample_group variable seems to be correlated with Factor3, 5 and 6.

Using those correlations, try to identify some similarities between Factors extracted by RGCCA and MOFA .

CD4 is correlated with Factor2 in the previous figure, try to identify with the plot_factor function this association. Not so obvious visually. Display factor values for each sample colored following a metadata variable.

# Plot
p <- plot_factor(MOFAobject, 
  factors = c(1:10),
  color_by = "CD4",
  dot_size = 3,        # change dot size
  dodge = T,           # dodge points with different colors
  legend = T,          # remove legend
#  add_violin = T,      # add violin plots (here we have too few points to make violin, let's do boxplots)
#  violin_alpha = 0.25  # transparency of violin plots
  add_boxplot = T
)

# The output of plot_factor is a ggplot2 object that we can edit
print(p)

Combinations of factors

You can compute the pairwise combination of factors with the plot_factors function (up to factor 7 here). You can chose a metadata variable to color samples.

plot_factors(MOFAobject, 
  factors = 1:7,
  color_by = "GR"
)

Which factors seem to be correlated ?

Explore individual factors

Useful functions

To explore loadings

  • plot_top_weights(): plot top loadings for a given factor/view
  • plot_weights(): plot all loadings for a given factor/view
  • plot_weights_heatmap(): plot a heatmap of the loadings from multiple factors of a given view

To explore data

  • plot_data_heatmap(): heatmap of the training data using only top features of a given factor
  • plot_data_scatter(): scatterplot of the data using only top features of a given factor

Functions usage are shown in the following sections.

Table of weights/factors

The weights measure how strong each feature relates to each factor :

no association = 0 ; strong association = high absolute values

The sign of the weight gives the direction of the effect (e.g. a positive weight indicates that the gene has higher levels in the cells with positive factor values).

Table of mRNA weigths

datatable(MOFAobject@expectations$W$mrna[order(MOFAobject@expectations$W$mrna[,1], decreasing = TRUE ),]) %>% formatRound(c(1,c(1:dim(MOFAobject@expectations$W$mrna)[2])))

Table of miRNA weigths

datatable(MOFAobject@expectations$W$mirna[order(MOFAobject@expectations$W$mirna[,1], decreasing = TRUE),]) %>% formatRound(c(1,c(1:dim(MOFAobject@expectations$W$mirna)[2])))

Table of Methylation weigths

datatable(MOFAobject@expectations$W$methylation[order(MOFAobject@expectations$W$methylation[,1], decreasing = TRUE),]) %>% formatRound(c(1,c(1:dim(MOFAobject@expectations$W$methylation)[2])))

Factor6

Factor 6 seems to be the more associated to Sample_Group variable (Factors 3 and 5 also but less).

Plot Factor6 values for the different samples groups

We can first check it with the plot_factor function only for Factor6.

plot_factor(MOFAobject, 
  factors = 6, 
  color_by = "Factor6",
  group_by = "Sample_Group"
) 

Factor6 is clearly associated with Sample_Group.

\(~\) \(~\)

Find top genes involved in Factor6 (mRNA view)

  • Visualize weight distribution for Factor6 with plot_weights function.
plot_weights(MOFAobject,
 view = "mrna",
 factor = 6,
 nfeatures = 10,     # Top number of features to highlight
 scale = T           # Scale weights from -1 to 1
)

  • Zoom on top 10 absolute weight for Factor6 with plot_top_weights function
plot_top_weights(MOFAobject,
 view = "mrna",
 factor = 6,
 nfeatures = 10,     # Top number of features to highlight
 scale = T           # Scale weights from -1 to 1
)

We plot the 10 genes having the highest absolute weight. The weights measure how strong each feature/gene relates to each factor (no association = 0 ; strong association = high absolute values). The sign of the weight gives the direction of the effect (e.g. a positive weight indicates that the gene has higher levels in the cells with positive factor values).

\(~\) \(~\)

Check expression level of genes involved in Factor6

We can plot a heatmap to display expression of genes with higher absolute value of the weight for Factor6 across sample with plot_data_heatmap function.

plot_data_heatmap(MOFAobject, 
  view = "mrna",
  factor = 6,  
  features = 25,
  cluster_rows = FALSE, cluster_cols = FALSE,
  show_rownames = TRUE, show_colnames = TRUE,
  denoise=TRUE,
  scale = "row"
)

Individual expression plot of the top 4 genes having positive weight with plot_data_scatter function.

plot_data_scatter(MOFAobject, 
  view = "mrna",
  factor = 6,  
  features = 4,
  sign = "positive",
  color_by = "Sample_Group",
) + labs(y="RNA expression")

\(~\) \(~\)

Individual plot of the top 4 genes having negative factor weight.

plot_data_scatter(MOFAobject, 
  view = "mrna",
  factor = 6,  
  features = 4,
  sign = "negative",
  color_by = "Sample_Group",
) + labs(y="RNA expression")

\(~\) \(~\)

Find top miRNA involved in Factor6

  • Visualize weight distribution for Factor6
plot_weights(MOFAobject,
 view = "mirna",
 factor = 6,
 nfeatures = 10,     # Top number of features to highlight
 scale = T           # Scale weights from -1 to 1
)

  • Zoom on top 10 absolute weight for Factor6
plot_top_weights(MOFAobject,
 view = "mirna",
 factor = 6,
 nfeatures = 10,     # Top number of features to highlight
 scale = T           # Scale weights from -1 to 1
)

We plot the 10 miRNA having the highest absolute weight. The weights measure how strong each feature/methylation site relates to each factor (no association = 0 ; strong association = high absolute values). The sign of the weight gives the direction of the effect (e.g. a positive weight indicates that the miRNA levels in the sample with positive factor values).

We can plot heatmap also.

plot_data_heatmap(MOFAobject, 
  view = "mirna",
  factor = 6,  
  features = 25,
  cluster_rows = FALSE, cluster_cols = FALSE,
  show_rownames = TRUE, show_colnames = TRUE,
  scale = "row"
)

Individual plot of the top 4 miRNA having positive factor values

plot_data_scatter(MOFAobject, 
  view = "mirna",
  factor = 6,  
  features = 4,
  sign = "positive",
  color_by = "Sample_Group",
) + labs(y="mirna")

\(~\) \(~\)

Individual plot of the top 4 miRNA having negative factor values

plot_data_scatter(MOFAobject, 
  view = "mirna",
  factor = 6,  
  features = 4,
  sign = "negative",
  color_by = "Sample_Group",
) + labs(y="mirna")

You can do the same for methylation view, keeping in mind that methylation view is not well represented by the 10 factors.